import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.offline as pyo
import plotly.io as pio
from sklearn.preprocessing import StandardScaler
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Read the Global DPI table from disk; the trailing expression previews the
# first five rows when run as a notebook cell.
df = pd.read_csv(filepath_or_buffer='D:/Global DPI.csv')
df.head(n=5)
| Country | AveragScore | SafetySecurity | PersonelFreedom | Governance | SocialCapital | InvestmentEnvironment | EnterpriseConditions | MarketAccessInfrastructure | EconomicQuality | LivingConditions | Health | Education | NaturalEnvironment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Denmark | 84.55 | 92.59 | 94.09 | 89.45 | 82.56 | 82.42 | 79.64 | 78.79 | 76.81 | 95.77 | 81.07 | 87.48 | 73.94 |
| 1 | Sweden | 83.67 | 90.97 | 91.90 | 86.41 | 78.29 | 82.81 | 75.54 | 79.67 | 76.18 | 95.33 | 82.28 | 85.92 | 78.74 |
| 2 | Norway | 83.59 | 93.30 | 94.10 | 89.66 | 79.03 | 82.24 | 75.95 | 75.87 | 77.25 | 94.70 | 82.98 | 85.68 | 72.37 |
| 3 | Finland | 83.47 | 89.56 | 91.96 | 90.41 | 77.27 | 84.12 | 77.25 | 78.77 | 70.28 | 94.46 | 81.19 | 88.38 | 77.99 |
| 4 | Switzerland | 83.42 | 95.66 | 87.50 | 87.67 | 69.14 | 80.81 | 83.84 | 78.65 | 79.71 | 94.66 | 82.11 | 87.72 | 73.60 |
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()
Country 0 AveragScore 0 SafetySecurity 0 PersonelFreedom 0 Governance 0 SocialCapital 0 InvestmentEnvironment 0 EnterpriseConditions 0 MarketAccessInfrastructure 0 EconomicQuality 0 LivingConditions 0 Health 0 Education 0 NaturalEnvironment 0 dtype: int64
# Quick sanity checks on the raw table.
# Only the last expression is echoed by the notebook; describe() is evaluated
# but its result is discarded, and info() prints its report directly.
df.describe()
df.info()
# Number of rows that are exact repeats of an earlier row.
df.duplicated(keep='first').sum()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 167 entries, 0 to 166 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 167 non-null object 1 AveragScore 167 non-null float64 2 SafetySecurity 167 non-null float64 3 PersonelFreedom 167 non-null float64 4 Governance 167 non-null float64 5 SocialCapital 167 non-null float64 6 InvestmentEnvironment 167 non-null float64 7 EnterpriseConditions 167 non-null float64 8 MarketAccessInfrastructure 167 non-null float64 9 EconomicQuality 167 non-null float64 10 LivingConditions 167 non-null float64 11 Health 167 non-null float64 12 Education 167 non-null float64 13 NaturalEnvironment 167 non-null float64 dtypes: float64(13), object(1) memory usage: 18.4+ KB
0
# Standardise every numeric indicator (everything except the country name).
# NOTE(review): df_scaled is not referenced again in the visible cells --
# confirm whether the clustering further down was meant to use it.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df.drop(columns="Country")),
    columns=df.columns.drop("Country"),
)

# Pairwise correlation of the raw (unscaled) indicators, drawn as a heatmap.
corr = df.drop(columns="Country").corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='bwr', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()
# Names of the numeric indicator columns that the plotting loops iterate over.
cols = [
    'AveragScore',
    'SafetySecurity',
    'PersonelFreedom',
    'Governance',
    'SocialCapital',
    'InvestmentEnvironment',
    'EnterpriseConditions',
    'MarketAccessInfrastructure',
    'EconomicQuality',
    'LivingConditions',
    'Health',
    'Education',
    'NaturalEnvironment',
]
# Summary statistics for the overall score column.
df['AveragScore'].describe()
count 167.000000 mean 58.056228 std 13.309964 min 30.400000 25% 47.770000 50% 57.530000 75% 66.860000 max 84.550000 Name: AveragScore, dtype: float64
# Vertical box plot showing the spread of the overall score.
ax = sns.boxplot(
    data=df['AveragScore'],
    orient='v',
    width=0.2,
)
ax.set_title('AvgScore', fontsize=18)
ax.set_xlabel('AvgScore', fontsize=10)
ax.figure.set_size_inches(6, 5)
plt.show()
import plotly.express as px
# Draw one bar chart per indicator showing the ten highest-scoring countries.
# The original if/else on 'AveragScore' ran the same code in both branches,
# so a single loop body handles every column.
for i in cols:
    # Select only the metric column: 'Country' is already the group key, so
    # re-selecting it as a value column is redundant.
    top_10 = (
        df.groupby('Country')[[i]]
        .sum()
        .sort_values(i, ascending=False)
        .round(2)
        .head(10)
    )
    fig = px.bar(
        top_10,
        x=top_10.index,
        y=i,
        title='Top 10 Countries by ' + i,
        template='seaborn',
        color=top_10.index,
        text=i,
    )
    fig.show()
# Draw one bar chart per indicator showing the ten lowest-scoring countries.
# The original if/else on 'AveragScore' ran the same code in both branches,
# so a single loop body handles every column.

# Load plotly.js into the notebook once, instead of on every iteration.
pyo.init_notebook_mode(connected=True)

for i in cols:
    char_bar = df.groupby(['Country'])[[i]].sum().reset_index()
    char_bar = char_bar.sort_values(by=i, ascending=True)
    # Ascending sort, so the first ten rows are the lowest values.
    top = char_bar.head(10)

    fig = go.Figure()
    fig.add_trace(go.Bar(x=top['Country'], y=top[i]))
    fig.update_layout(
        title='Lowest Countries According to ' + i,
        xaxis_title='Country',
        yaxis_title=i,
        plot_bgcolor='#F0EEED',
        paper_bgcolor='#F0EEED',
        font=dict(color='black'),
    )
    pyo.iplot(fig)
import plotly.express as px
# Draw a world choropleth for every indicator and save each as geo-<name>.html.
# The original if/else on 'AveragScore' ran the same code in both branches,
# so a single loop body handles every column.
for i in cols:
    fig = px.choropleth(
        df,
        locations='Country',
        color=i,
        locationmode='country names',
        title=f'{i} - Choropleth',
        color_continuous_scale='Viridis_r',
    )
    fig.show()
    fig.write_html(f"geo-{i}.html")
## K MEANS
# Move the country name into the index so that only the numeric columns
# remain as columns for the clustering cells below.
df = df.set_index(keys='Country')
# IPython shell escape (not plain Python): installs yellowbrick, which provides
# the KElbowVisualizer imported further down.
!pip install yellowbrick
Collecting yellowbrick Obtaining dependency information for yellowbrick from https://files.pythonhosted.org/packages/06/35/c7d44bb541c06bc41b3239b27af79ea0ecc7dbb156ee1335576f99c58b91/yellowbrick-1.5-py3-none-any.whl.metadata Downloading yellowbrick-1.5-py3-none-any.whl.metadata (7.7 kB) Requirement already satisfied: matplotlib!=3.0.0,>=2.0.2 in c:\users\jatin\anaconda3\lib\site-packages (from yellowbrick) (3.7.2) Requirement already satisfied: scipy>=1.0.0 in c:\users\jatin\anaconda3\lib\site-packages (from yellowbrick) (1.11.1) Requirement already satisfied: scikit-learn>=1.0.0 in c:\users\jatin\anaconda3\lib\site-packages (from yellowbrick) (1.3.0) Requirement already satisfied: numpy>=1.16.0 in c:\users\jatin\anaconda3\lib\site-packages (from yellowbrick) (1.24.3) Requirement already satisfied: cycler>=0.10.0 in c:\users\jatin\anaconda3\lib\site-packages (from yellowbrick) (0.11.0) Requirement already satisfied: contourpy>=1.0.1 in c:\users\jatin\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.0.5) Requirement already satisfied: fonttools>=4.22.0 in c:\users\jatin\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\jatin\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.4.4) Requirement already satisfied: packaging>=20.0 in c:\users\jatin\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (23.1) Requirement already satisfied: pillow>=6.2.0 in c:\users\jatin\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (9.4.0) Requirement already satisfied: pyparsing<3.1,>=2.3.1 in c:\users\jatin\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\users\jatin\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (2.8.2) Requirement already satisfied: joblib>=1.1.1 in 
c:\users\jatin\anaconda3\lib\site-packages (from scikit-learn>=1.0.0->yellowbrick) (1.2.0) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\jatin\anaconda3\lib\site-packages (from scikit-learn>=1.0.0->yellowbrick) (2.2.0) Requirement already satisfied: six>=1.5 in c:\users\jatin\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.16.0) Downloading yellowbrick-1.5-py3-none-any.whl (282 kB) ---------------------------------------- 0.0/282.6 kB ? eta -:--:-- ---------------------------------------- 0.0/282.6 kB ? eta -:--:-- -- ------------------------------------ 20.5/282.6 kB 330.3 kB/s eta 0:00:01 -------------------- ------------------- 143.4/282.6 kB 1.7 MB/s eta 0:00:01 ---------------------------------------- 282.6/282.6 kB 2.5 MB/s eta 0:00:00 Installing collected packages: yellowbrick Successfully installed yellowbrick-1.5
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_score
import os
# Limit OpenMP to one thread; the scikit-learn UserWarning recorded in this
# notebook's output recommends OMP_NUM_THREADS=1 for KMeans on Windows with MKL.
# NOTE(review): that warning still appears in the outputs recorded after this
# line, so the variable is presumably being set too late (numpy/scikit-learn
# are already imported by this point) -- consider setting it before those
# imports; confirm.
os.environ['OMP_NUM_THREADS'] = '1'
# Elbow plot of K-Means distortion, used to pick the number of clusters.
plt.figure(figsize=(12, 8))
elbow_graph = KElbowVisualizer(
    KMeans(random_state=123),
    k=10,
)
elbow_graph.fit(df)
# Kept as the final expression so the notebook echoes the returned Axes.
elbow_graph.show()
C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. 
C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
<Axes: title={'center': 'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
# Fit a 3-cluster K-Means model and print the label assigned to each row.
# random_state is pinned (same seed as the elbow search) because an unseeded
# KMeans numbers its clusters differently on each run, as the differing label
# arrays recorded in this notebook's outputs show.
agrupador = KMeans(n_clusters=3, random_state=123)
agrupador.fit(df)
labels = agrupador.labels_
print(labels)
C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 0 2 0 0 2 2 0 0 0 0 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# Refit the 3-cluster K-Means model and echo the label array.
# random_state is pinned (same seed as the elbow search) so that repeated
# fits assign the same cluster ids instead of an arbitrary permutation.
agrupador = KMeans(n_clusters=3, random_state=123)
agrupador.fit(df)
labels = agrupador.labels_
labels
C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2,
2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
from sklearn.metrics import silhouette_samples, silhouette_score
# Candidate cluster counts for the silhouette sweep: 2 through 9 inclusive.
range_n_clusters = list(range(2, 10))
print(range_n_clusters)
[2, 3, 4, 5, 6, 7, 8, 9]
from sklearn.cluster import KMeans
# Mean silhouette score for each candidate number of clusters.
# valores_silhueta[j] corresponds to range_n_clusters[j].
valores_silhueta = []
for k in range_n_clusters:
    # Seeded (same seed as the elbow search) so the scores are reproducible
    # between runs rather than depending on the random initialisation.
    agrupador = KMeans(n_clusters=k, random_state=123)
    labels = agrupador.fit_predict(df)
    media_silhueta = silhouette_score(df, labels)
    valores_silhueta.append(media_silhueta)
C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1. 
C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
# Final 3-cluster K-Means model; its labels are stored on the frame later on.
# random_state is pinned (same seed as the elbow search) so the cluster ids
# are stable between runs -- the unseeded fits recorded in this notebook's
# outputs number the same groups differently each time.
agrupador_kmeans = KMeans(n_clusters=3, random_state=123)
labels_kmeans = agrupador_kmeans.fit_predict(df)
print("Labels K-means: ", labels_kmeans)
C:\Users\Jatin\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
Labels K-means: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
# Report the mean silhouette coefficient of the final clustering.
print(
    "The K-means silhouette coefficient is:",
    silhouette_score(X=df, labels=labels_kmeans),
)
The K-means silhouette coefficient is: 0.38133235294176215
# Attach each row's cluster label, then summarise every indicator per cluster.
df["cluster"] = labels_kmeans
df.groupby(by="cluster").describe()
| AveragScore | SafetySecurity | ... | Education | NaturalEnvironment | |||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | ... | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| cluster | |||||||||||||||||||||
| 0 | 41.0 | 76.649024 | 4.683675 | 68.24 | 72.99 | 77.31 | 80.31 | 84.55 | 41.0 | 86.537561 | ... | 85.19 | 91.44 | 41.0 | 67.135610 | 6.309125 | 53.20 | 63.23 | 68.58 | 71.71 | 78.74 |
| 1 | 57.0 | 43.884737 | 5.448067 | 30.40 | 41.87 | 44.67 | 47.87 | 53.68 | 57.0 | 52.389649 | ... | 44.52 | 61.10 | 57.0 | 51.321053 | 6.483988 | 33.67 | 48.46 | 52.69 | 56.10 | 62.22 |
| 2 | 69.0 | 58.715217 | 4.231192 | 47.71 | 55.91 | 58.56 | 61.27 | 66.88 | 69.0 | 68.047101 | ... | 69.19 | 81.87 | 69.0 | 53.809275 | 6.674120 | 40.27 | 48.56 | 53.92 | 58.64 | 69.35 |
3 rows × 104 columns
# Turn the country index back into a column so plotly can use it as the
# location key, then draw the world map coloured by the `cluster` column.
df.reset_index(level=0, inplace=True)
fig = px.choropleth(
    df,
    locations='Country',
    color='cluster',
    locationmode='country names',
    title='Cluster - Choropleth',
    color_continuous_scale='Rainbow',
)
fig.update_layout(
    margin=dict(r=0, t=0, l=0, b=0),
    coloraxis_colorbar={
        'title': 'Cluster',
        'ticks': 'outside',
        'tickvals': [0, 1, 2],  # one tick per cluster id
        'dtick': 12,
    },
)
fig.show()